# Start from an empty global environment so that objects left over from an
# earlier session cannot leak into this script.
# NOTE(review): this also deletes anything the caller had defined before
# running/sourcing the script.
rm(list = ls())
# Load the project helpers. partydied(), seatsdiff() and maxgain() (used in
# STEP C) are not defined in this file, so presumably they come from here --
# as, presumably, do the library() calls providing ddply(), melt(), cast(),
# read.dta() and str_extract(). TODO confirm.
source("./functions/fun.R")


# Purpose: 
#  Reshape the CMP/EMP data (step A-B) and integrate (step C) them into a single dataframe. 
#
#
# 
#




#####################
#### STEP A: CMP ####
#####################

# LOAD the raw CMP data.
# NOTE(review): the original header announced a subset to EU countries, but no
# subsetting is done here -- presumably the input file is already restricted.
cmp <- read.csv("./datasource/MPDataset_MPDS2016b_est2.csv")


# CONSTRUCT the 16 issue scales (y_{j,l})
######

# Aggregate sub-categories if necessary.
# For each main category the main column is overwritten with the row-wise sum
# of every column whose name starts with that code (the main column itself
# included). Missing values count as 0, so a row that is NA everywhere gets 0.
# NOTE(review): if the main column already holds the aggregate of its
# sub-categories for some rows, this double counts -- verify with the codebook.
agg_codes <- c("per202", "per412", "per413", "per401", "per503",
               "per504", "per607", "per608", "per705", "per601")
for (code in agg_codes) {
  code_cols <- grep(paste0("^", code), colnames(cmp))
  # drop = FALSE keeps a single matching column as a one-column data frame;
  # without it the selection collapses to a vector and rowSums() fails.
  cmp[[code]] <- rowSums(cmp[, code_cols, drop = FALSE], na.rm = TRUE)
}

# Reconstruct the original word counts.
# Every column whose name ends in "per" + 3-4 digits is divided by 100,
# multiplied by the manifesto's `total` and rounded (the per* columns are
# presumably percentage shares of `total`).
p <- round((cmp[, grep("per[0-9]{3,4}$", colnames(cmp))] / 100) * cmp$total)

# Sum of the reconstructed counts over exactly the categories that enter the
# scales lr1-lr8 below. The original expression listed per407 twice; every
# other category appeared once, so the duplicate is dropped here.
# NOTE(review): per401/per412/per413 were overwritten upstream with aggregates
# that include their four-digit sub-categories, which are added again here --
# confirm this is intended.
total_cats <- c("per401", "per403", "per402", "per404", "per406", "per407",
                "per4012", "per412", "per413", "per503", "per504", "per505",
                "per506", "per507", "per4132", "per4123", "per4011", "per4131",
                "per4124", "per4013", "per409", "per410")
p$total <- Reduce(`+`, p[total_cats])


# Build scales, name them as in König/Luig (2012).
# Each scale is a log ratio of two (sums of) category counts; 0.5 is added to
# both sides so that a zero count does not produce log(0).
p$lr1 <- with(p, log(per402 + 0.5) - log(per403 + 0.5))
p$lr2 <- with(p, log(per4012 + 0.5) - log(per404 + per409 + 0.5))
p$lr3 <- with(p, log(per407 + per410 + 0.5) - log(per406 + 0.5))
p$lr4 <- with(p, log(per401 + 0.5) - log(per412 + per413 + 0.5))
p$lr5 <- with(p, log(per505 + 0.5) - log(per503 + per504 + 0.5))
p$lr6 <- with(p, log(per507 + 0.5) - log(per506 + 0.5))
p$lr7 <- with(p, log(per4011 + 0.5) - log(per4132 + per4123 + 0.5))
p$lr8 <- with(p, log(per4013 + 0.5) - log(per4131 + per4124 + 0.5))

# Attach auxiliary variables.
# p was computed column-wise from cmp, so its rows line up one-to-one with the
# rows of cmp and the identifiers can be copied over directly.
p[c("partyname", "party", "countryname")] <-
  cmp[c("partyname", "party", "countryname")]
# EC: the election date is parsed as day/month/year to fit the new data.
p$election <- as.Date(cmp$edate, format = "%d/%m/%Y")
# Year and month are kept as character strings ("1998", "09").
p[["year"]] <- format(p[["election"]], "%Y")
p[["month"]] <- format(p[["election"]], "%m")
# Absolute number of seats won by the party.
p$seats <- cmp$absseat


# SELECT and CAST
######

# Within every country-year, number the elections by calendar month
# (1 = earliest month present) and keep only that first election.
p <- ddply(p, c("year", "countryname"), function(grp) {
  grp$electno <- as.numeric(as.factor(grp$month))
  grp
})
# p1 keeps the numbered but still unfiltered data.
p1 <- p
p <- p[p$electno == 1, ]

# Reshape the dataframe.
# Long format first (one row per party x scale), then back to one row per
# country / year / party / seats with one column per scale; duplicated cells
# are averaged.
cmplong <- melt(
  p,
  id.vars = c("party", "partyname", "countryname", "year", "seats"),
  measure.vars = grep("(lr)|(eu)", colnames(p), value = TRUE)
)
cmpwide <- cast(
  cmplong,
  countryname + year + party + seats ~ variable,
  fun.aggregate = mean
)
cmpwide$year <- as.numeric(cmpwide$year)
# study == 1 marks rows that come from the CMP data.
cmpwide$study <- 1


#####################
#### STEP B: EMP ####
#####################


# LOAD the data.
######

emp <- read.dta("./datasource/EMP_79_2009_short_east.dta")
# Force columns 3 to 13 to numeric. apply() first turns the selection into a
# matrix, so the values are converted from their printed form.
emp[, 3:13] <- apply(emp[, 3:13], MARGIN = 2, FUN = as.numeric)

# RECODE
######

# The party id is the run of digits at the very start of eind_id (an empty
# string if eind_id does not start with a digit).
emp$eind_id <- as.character(emp$eind_id)
emp$partyid <- str_extract(emp$eind_id, "[0-9]*")

# Country names as plain character strings.
# NOTE(review): the original comment here announced "Select EU countries and
# merge sub-national units", but only this type conversion is performed.
emp$country <- as.character(emp$country)

# Collect the eight EMP scales lr1..lr8 in a matrix, selecting the columns by
# (exact) name instead of creating eight temporary global copies.
scale_names <- paste0("lr", 1:8)
scales <- do.call(cbind, as.list(emp[scale_names]))

# CAST and SELECT
######

# Keep only the identifiers and the scales.
emp <- cbind(emp[, c("eind_id", "country", "partyid", "year", "seats")], scales)
emplong <- melt(
  emp,
  id.vars = c("partyid", "country", "year", "seats"),
  measure.vars = grep("(lr)|(eu)", colnames(emp), value = TRUE)
)
# fun.aggregate = mean mirrors the CMP branch. With unique
# country/year/seats/partyid keys it returns the values unchanged; with
# duplicated keys cast() would otherwise silently fall back to length() and
# replace the scale values by counts.
empwide <- cast(
  emplong,
  country + year + seats + partyid ~ variable,
  fun.aggregate = mean
)
# Use the same column names as the CMP data.
colnames(empwide) <- sub("country", "countryname", colnames(empwide))
colnames(empwide) <- sub("partyid", "party", colnames(empwide))


# study == 2 marks rows that come from the EMP data.
empwide$study <- 2


# CONVERT IDs
# EMP and CMP use different IDs for what is actually the same party.
# The EMP IDs are recoded here to fit the CMP ID structure.
######

# Converter table; the code below reads its columns `old` and `new`
# (presumably old = EMP id, new = CMP id -- TODO confirm against the file).
empconvt <- read.csv("./datasource/emp_cmp_converter.csv")
# Look up the replacement id for every EMP party; parties that are not listed
# in the converter get NA.
empwide$new <- empconvt$new[match(empwide$party, empconvt$old)]
# Fall back to the existing id wherever no replacement was found.
# NOTE(review): the resulting type depends on how `new` and `party` coerce
# (e.g. numeric vs. character) -- check before combining with the CMP ids.
empwide$new[is.na(empwide$new)] <- empwide$party[is.na(empwide$new)]
# Replace the id column and drop the temporary helper column.
empwide$party <- empwide$new
empwide$new <- NULL






#####################################
#### STEP C: INTEGRATE CMP / EMP ####
#####################################


# Who won the election, i.e. gained most seats?
# won = 1 marks the first row holding the largest seat count of its group;
# all other rows keep won = 0.
# NOTE(review): CMP is grouped by country and year, EMP by year only (and is
# passed to partydied() as a whole below) -- presumably EMP is treated as one
# common arena; confirm this is intended.
cmpwide$won <- 0
empwide$won <- 0
cmpwide <- ddply(cmpwide, c("countryname", "year"), function(grp) {
  grp$won[which.max(grp$seats)] <- 1
  grp
})
empwide <- ddply(empwide, c("year"), function(grp) {
  grp$won[which.max(grp$seats)] <- 1
  grp
})


# Note: there are parties in the data that were 'resurrected'.
#  We assume a common trajectory / path for them.
# `die` starts at 0 and is handed to partydied() (defined outside this file).
empwide$die <- 0
cmpwide$die <- 0
cmpwide <- ddply(cmpwide, c("countryname"), partydied)
empwide <- partydied(empwide)

# Who gained most after an election?
# seatsdiff() is applied per party and maxgain() per election; both helpers
# are defined outside this file.
empwide$seatsdiff <- NA
cmpwide$seatsdiff <- NA
empwide$gain <- 0
cmpwide$gain <- 0

cmpwide <- ddply(cmpwide, c("party"), seatsdiff)
cmpwide <- ddply(cmpwide, c("countryname", "year"), maxgain)

empwide <- ddply(empwide, c("party"), seatsdiff)
empwide <- ddply(empwide, c("year"), maxgain)


# Append CMP with the EMP data.
cmpempwide <- rbind(cmpwide, empwide)

# Get rid of observations that carry no values: keep only rows whose country
# name is neither missing nor the literal string "NA".
cmpempwide <- cmpempwide[
  !is.na(cmpempwide$countryname) & cmpempwide$countryname != "NA",
]
cmpempwide$countryname <- as.character(cmpempwide$countryname)



# SAVE #
########

# Write the combined data frame to the working directory.
save(cmpempwide, file = "cmpempwide_all_east2.Rdata")






